library("cluster")
library("dendextend")
##
## ---------------------
## Welcome to dendextend version 1.14.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
source("functions.R")
## Loading required package: ggplot2
# Get data with Stylo
# data = stylo::load.corpus.and.parse(corpus.dir = "../dh-meier-data/output/transkribus/tokenized/boudams/", features = "w", ngram.size = 1, preserve.case = FALSE)
# Get freq lists
#data = stylo::make.table.of.frequencies(corpus = data, features = unique(sort(unlist(data))), relative = FALSE)
# Write it
#write.csv(as.matrix(data), "data/transkr_expanded_words.csv")
# Read the expanded word-form counts and transpose the table, so that
# colSums() below yields one total per text.
data <- t(read.csv("data/transkr_expanded_words.csv", header = TRUE, row.names = 1))
# Total count per text, used throughout as the estimated text length in words
nwords <- colSums(data)
summary(nwords)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 294 2259 3543 5089 6783 18981
# Draw the box plot of text lengths once and keep its (invisibly returned)
# statistics, instead of calling boxplot() a second time -- and drawing a
# duplicate figure -- just to read the outliers.
nwordsBox <- boxplot(nwords)
# Texts whose length falls outside the whiskers
nwordsBox$out
## 05_Ano_Leg-A_Ap_NA_Vie_Jacques 29_Wau_Leg-C_Co_Ev_Vie_Martin
## 18068 14458
## 31_Wau_Leg-C_Co_Ev_Dia_Martin3 34_Wau_Leg-C_Co_Ev_Vie_Martial
## 18981 15299
# The fifteen shortest texts, shortest first
head(sort(nwords, decreasing = FALSE), n = 15L)
## 03_Ano_Leg-A_Ap_NA_Mar_Jean 62_Ano_Leg-N_NA_NA_NA_Index
## 294 301
## 61_Ano_Leg-B_NA_NA_NA_Jugement 30_Wau_Leg-C_Co_Ev_Tra_Martin2
## 403 726
## 08_Ano_Leg-A_Ap_NA_Vie_Philippe 59_Ano_Leg-C_Vi_NA_Vie_Euphrasie
## 1011 1287
## 09_Ano_Leg-A_Ap_NA_Vie_JacquesMineur 32_Wau_Leg-C_Co_Ev_Vie_Brice
## 1350 1395
## 60_Ano_Leg-B_NA_NA_NA_Antechriste 54_Ano_Leg-C_Vi_NA_Vie_Pelagie
## 1501 1520
## 20_Ano_Leg-B_Ma_Fe_Vie_Felicite 11_Ano_Leg-A_Ap_NA_Vie_Marc
## 1675 1822
## 23_Ano_Leg-B_Ma_Ho_Vie_Sixte 53_Ano_Leg-C_Vi_NA_Vie_Marguerite
## 1890 1940
## 35_Wau_Leg-C_Co_Ev_Vie_Nicolas
## 1977
# Texts retained for every analysis below: more than 1000 words, and no
# "Bestiaire" in the name.
toKeep <- grep(
  "Bestiaire",
  colnames(data)[nwords > 1000],
  invert = TRUE,
  value = TRUE
)
# Same length diagnostics as above, restricted to the retained texts (disabled):
# nwords = colSums(data[, toKeep])
# summary(nwords)
# boxplot(nwords)
# boxplot(nwords)$out
# head(sort(nwords), n = 15)
# Temporary test, to be removed afterwards:
# toKeep = toKeep[!toKeep == "60_Ano_Leg-B_NA_NA_NA_Antechriste"]

# Violin + box plot of the text lengths. nwords is not filtered by toKeep
# here, so the plot covers every text of the table.
df <- as.data.frame(nwords)
ggplot(df, aes(x = "", y = nwords)) +
  geom_violin() +
  geom_boxplot(width = 0.3) +
  theme(
    axis.text.y = element_text(size = rel(1.4)),
    axis.title = element_text(size = rel(1.4))
  ) +
  xlab("Est. length in words of corpus texts") +
  scale_y_continuous(breaks = seq(0, 17500, by = 2500))
# Get data with Stylo
#data = stylo::load.corpus.and.parse(corpus.dir = "../dh-meier-data/output/transkribus-etudiants/raw/", features = "c", ngram.size = 3, preserve.case = FALSE)
# Get freq lists
#data = stylo::make.table.of.frequencies(corpus = data, features = unique(sort(unlist(data))), relative = FALSE)
# Write it
#write.csv(as.matrix(data), "data/transkr_raw_char3grams.csv")
# Character 3-gram counts from the raw transcriptions: transpose, keep the
# retained texts only, then drop features that no longer occur in any of them.
data <- t(read.csv("data/transkr_raw_char3grams.csv", header = TRUE, row.names = 1))
data <- data[, toKeep]
data <- data[rowSums(data) > 0, ]
d <- data
# Feature selection after Moisl 2011, done by selection() from functions.R.
# Column 4 of its result is used as the row selector.
# NOTE(review): confirm what the other columns of selection() hold.
select <- selection(d, z = 1.645)[, 4]
# Relative frequencies
d <- relativeFreqs(d)
# Copy kept for robustness checks.
# NOTE(review): stored BEFORE the selection is applied, whereas the other
# *Save objects of this script are stored after it -- confirm this is intended.
Raw3grSave <- d
# Selected features only, then normalisation
d <- normalisations(d[select, ])
# Agglomerative clustering of the texts (columns of d)
myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
CAHRaw3gr <- myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotRaw3grams <- cahPlotCol(
  myCAH,
  k = 5,
  main = "Characters 3-grams from raw data (Transkr)"
)
# somCAH = somCluster(d)
# somCAHRaw3gr = somCAH
# somplotRaw3grams = cahPlotCol(somCAH, k = 5, main = "SOM BASED - Characters 3-grams from raw data (Transkr)")
# Cluster membership of each text when the tree is cut into 5 groups
classes <- cutree(myCAH, k = 5)
classes
## 00_Ano_Leg-A_Ap_Ev_Dis_Pierre1 01_Ano_Leg-A_Ap_NA_Vie_Pierre2
## 1 1
## 02_Ano_Leg-A_Ap_NA_Pas_Paul 04_Ano_Leg-A_Ap_NA_Vie_Jean_Ev
## 1 1
## 05_Ano_Leg-A_Ap_NA_Vie_Jacques 06_Ano_Leg-A_Ap_NA_Vie_Matthieu
## 1 2
## 07_Ano_Leg-A_Ap_NA_Vie_SimonJude 08_Ano_Leg-A_Ap_NA_Vie_Philippe
## 2 2
## 09_Ano_Leg-A_Ap_NA_Vie_JacquesMineur 10_Ano_Leg-A_Ap_NA_Vie_Barthelemy
## 2 2
## 11_Ano_Leg-A_Ap_NA_Vie_Marc 12_Ano_Leg-A_Ma_Ho_Vie_Longin
## 2 2
## 13_Ano_Leg-B_Ma_Ho_Vie_Sebastien 14_Ano_Leg-B_Ma_Ho_Vie_Vincent
## 2 2
## 15_Ano_Leg-B_Ma_Ho_Vie_Georges 16_Ano_Leg-B_Ma_Ho_Vie_Christophe
## 2 2
## 17_Ano_Leg-B_Ma_Fe_Vie_Agathe 18_Ano_Leg-B_Ma_Fe_Vie_Luce
## 2 2
## 19_Ano_Leg-B_Ma_Fe_Vie_Agnes 20_Ano_Leg-B_Ma_Fe_Vie_Felicite
## 2 2
## 21_Ano_Leg-B_Ma_Fe_Vie_Christine 22_Ano_Leg-B_Ma_Fe_Vie_Cecile
## 2 2
## 23_Ano_Leg-B_Ma_Ho_Vie_Sixte 24_Ano_Leg-B_Ma_Ho_Vie_Laurent
## 2 2
## 25_Ano_Leg-B_Ma_Ho_Vie_Hippolyte 26_Ano_Leg-B_Ma_Ev_Vie_Lambert
## 2 3
## 27_Ano_Leg-B_Ma_Ho_Vie_Pantaleon 28_Ano_Leg-B_Ma_Ho_Vie_Clement
## 4 5
## 29_Wau_Leg-C_Co_Ev_Vie_Martin 31_Wau_Leg-C_Co_Ev_Dia_Martin3
## 3 3
## 32_Wau_Leg-C_Co_Ev_Vie_Brice 33_Wau_Leg-C_Co_Er_Vie_Gilles
## 3 3
## 34_Wau_Leg-C_Co_Ev_Vie_Martial 35_Wau_Leg-C_Co_Ev_Vie_Nicolas
## 3 3
## 36_Wau_Leg-C_Co_Ev_Mir_Nicolas2 37_Wau_Leg-C_Co_Ev_Tra_Nicolas3
## 3 3
## 38_Wau_Leg-C_Co_Ev_Vie_Jerome 39_Wau_Leg-C_Co_Ev_Vie_Benoit
## 3 3
## 40_Wau_Leg-C_Co_Er_Vie_Alexis 41_Ano_Leg-C_Vi_NA_Vie_Irene
## 3 5
## 42_Ano_Leg-B_Vi_NA_Ass_NotreDame 43_Ano_Leg-C_Vi_NA_Vie_Catherine
## 4 4
## 44_Ano_Leg-C_Ap_NA_Vie_Andre 45_Ano_Leg-C_Ap_NA_Pas_Andre2
## 4 4
## 46_Ano_Leg-B_Co_NA_Pur_Patrice 47_Ano_Leg-C_Co_er_Vie_PaulErmite
## 5 5
## 48_Ano_Leg-C_Co_ev_Tra_Benoit2 49_Ano_Leg-C_NA_NA_Vie_Maur
## 5 5
## 50_Ano_Leg-C_NA_NA_Vie_Placide 51_Ano_Leg-C_Ma_ho_Vie_Eustache
## 5 5
## 52_Ano_Leg-C_Co_NA_Vie_Fursi 53_Ano_Leg-C_Vi_NA_Vie_Marguerite
## 5 5
## 54_Ano_Leg-C_Vi_NA_Vie_Pelagie 55_Ano_Leg-C_Co_NA_Vie_Simeon
## 5 5
## 56_Ano_Leg-C_Co_NA_Vie_Mamertin 57_Ano_Leg-C_Vi_NA_Vie_Julien
## 5 5
## 58_Ano_Leg-C_Vi_NA_Vie_MarieEgyptienne 59_Ano_Leg-C_Vi_NA_Vie_Euphrasie
## 5 5
## 60_Ano_Leg-B_NA_NA_NA_Antechriste
## 4
# Bar plot of the nfeats largest and nfeats smallest values found in the first
# column of maDesc$quanti$`3` (cluster 3 holds the Wau_ texts in the listing
# above). NOTE(review): maDesc is not defined in the visible part of this
# script; presumably the first column holds the v-test -- confirm.
nfeats <- 10
values <- c(
  head(sort(maDesc$quanti$`3`[, 1], decreasing = TRUE), n = nfeats),
  head(sort(maDesc$quanti$`3`[, 1]), n = nfeats)
)
classBarplot(values, title = "V-test for Wauchier class", ylab = "v-test")
Example of the two main features of the Wauchier class
# Cluster ids as a labelled factor, with the Lambert text moved to a level of
# its own ("LAMB").
# NOTE(review): classlabels is not defined in the visible part of this script.
class <- as.factor(classes)
levels(class) <- classlabels
levels(class) <- c(levels(class), "LAMB")
class["26_Ano_Leg-B_Ma_Ev_Vie_Lambert"] <- "LAMB"
# Relative frequencies with one row per text, plus the class of each text
rf <- cbind(
  as.data.frame(t(relativeFreqs(data))),
  class
)
# Scatter plot of two features, coloured by class
qplot(q.i.l, o.m.., colour = class, data = rf)
specifPlot(data, myCAH, k = 5)
# Expanded word forms: same table as at the top of the script, now restricted
# to the retained texts and to the features that still occur in them.
data <- t(read.csv("data/transkr_expanded_words.csv", header = TRUE, row.names = 1))
data <- data[, toKeep]
data <- data[rowSums(data) > 0, ]
# Raw counts kept for later use (specifPlot on words + lemmas)
dataWords <- data
d <- data
# Feature selection after Moisl 2011; column 4 of the result is the row selector
select <- selection(d, z = 1.645)[, 4]
# Relative frequencies of the selected features
d <- relativeFreqs(d)[select, ]
# Copy kept for robustness checks and for the combined analyses
WordsSave <- d
d <- normalisations(d)
myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
CAHForms <- myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotForms <- cahPlotCol(
  myCAH,
  k = 5,
  main = "Expanded word forms (Transkr/Boudams/Pie)"
)
# somCAH = somCluster(d)
# somCAHForms = somCAH
# somplotForms = cahPlotCol(somCAH, k = 5, main = "SOM BASED - Expanded word forms (Transkr/Boudams/Pie)")
# Affix counts, derived from the word-form table currently held in data
dataAffs <- countAffixes(data)
d <- dataAffs
# Feature selection after Moisl 2011; column 4 of the result is the row selector
select <- selection(d, z = 1.645)[, 4]
# Relative frequencies of the selected features
d <- relativeFreqs(d)[select, ]
# Copy kept for the combined analyses
AffixesSave <- d
d <- normalisations(d)
myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
CAHAffs <- myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotAffixes <- cahPlotCol(
  myCAH,
  k = 5,
  main = "Expanded affixes (Transkr/Boudams/Pie)"
)
# somCAH = somCluster(d)
# somCAHAffs = somCAH
# somplotAffixes = cahPlotCol(somCAH, k = 5, main = "SOM BASED - Expanded affixes (Transkr/Boudams/Pie)")
#labels(sort(rowSums(data), decreasing = TRUE)[1:300])
# With or without pronouns?
# The list of function words is the value of the last expression of functionWords.R
functionWords <- source("functionWords.R")$value
# data still holds the full word-form table at this point
dataFW <- data
# Relative frequencies, function-word rows only
d <- relativeFreqs(data)[functionWords, ]
# Copy kept for robustness checks and for the combined analyses
FWSave <- d
d <- normalisations(d)
myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
CAHFW <- myCAH
# barplot(sort(myCAH$height))
plotFW <- cahPlotCol(
  myCAH,
  k = 5,
  main = "Function words with pronouns and auxiliaries\n(Transkr/Boudams/Pie)"
)
#plotCol(myCAH, main = "toto")
# somCAH = somCluster(d)
# somCAHFW = somCAH
# somplotFW = cahPlotCol(somCAH, k = 5, main = "SOM BASED - Function words")
# POS 3-gram counts, ";"-separated, first column used as row names.
data <- read.csv("data/transkr_pos3-gr.csv", header = TRUE, row.names = 1, sep = ";")
# Remove the total-frequency column (first remaining column)
data <- data[, -1]
# Bring the column names back in line with toKeep. read.csv() with its default
# check.names = TRUE prefixes "X" to names starting with a digit and replaces
# "-" by ".".
colnames(data) <- gsub("^X", "", colnames(data))
# The leading "." is a regex wildcard here: it consumes the single character
# found in front of "decolumnized".
colnames(data) <- gsub(".decolumnized", "", colnames(data))
# fixed = TRUE: only the literal "Leg." is rewritten. As a regular expression,
# "Leg." would also match "Leg" followed by ANY character (e.g. inside a name
# such as "Leger") and corrupt that column name.
colnames(data) <- gsub("Leg.", "Leg-", colnames(data), fixed = TRUE)
# Retained texts only, then features that still occur in them
data <- data[, toKeep]
data <- data[rowSums(data) > 0, ]
data <- as.matrix(data)
# Raw counts kept for later use (specifPlot)
dataPOS3gr <- data
d <- data
# Feature selection after Moisl 2011; the full selection table is written to
# disk and its column 4 is used as the row selector
select <- selection(d, z = 1.645)
write.csv(select, file = "data/select_pos3gr_moisl.csv")
select <- select[, 4]
# Relative frequencies of the selected features
d <- relativeFreqs(d)
d <- d[select, ]
# Copy kept for robustness checks and for the combined analyses
POS3grSave <- d
d <- normalisations(d)
myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
CAHPOS3gr <- myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotPOS3grams <- cahPlotCol(myCAH, k = 5, main = "POS 3-grams (Transkr/Boudams/Pie/Pie)")
# somCAH = somCluster(d)
# somCAHPOS3gr = somCAH
# somplotPOS3grams = cahPlotCol(somCAH, k = 5, main = "SOM BASED - POS 3-grams")
# Lemma counts, ";"-separated, first column used as row names.
data <- read.csv("data/transkr_lemmas.csv", header = TRUE, row.names = 1, sep = ";")
# Remove the total-frequency column (first remaining column)
data <- data[, -1]
# Bring the column names back in line with toKeep. read.csv() with its default
# check.names = TRUE prefixes "X" to names starting with a digit and replaces
# "-" by ".".
colnames(data) <- gsub("^X", "", colnames(data))
# The leading "." is a regex wildcard here: it consumes the single character
# found in front of "decolumnized".
colnames(data) <- gsub(".decolumnized", "", colnames(data))
# fixed = TRUE: only the literal "Leg." is rewritten. As a regular expression,
# "Leg." would also match "Leg" followed by ANY character (e.g. inside a name
# such as "Leger") and corrupt that column name.
colnames(data) <- gsub("Leg.", "Leg-", colnames(data), fixed = TRUE)
# Retained texts only, then features that still occur in them
data <- data[, toKeep]
data <- data[rowSums(data) > 0, ]
data <- as.matrix(data)
# Raw counts kept for later use (specifPlot on words + lemmas)
dataLemmas <- data
d <- data
# Feature selection after Moisl 2011; the full selection table is written to
# disk and its column 4 is used as the row selector
select <- selection(d, z = 1.645)
write.csv(select, file = "data/select_lemmas_moisl.csv")
select <- select[, 4]
# Relative frequencies of the selected features
d <- relativeFreqs(d)
d <- d[select, ]
# Copy kept for the combined analyses
LemmasSave <- d
d <- normalisations(d)
myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
CAHLemmas <- myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotLemmas <- cahPlotCol(myCAH, k = 5, main = "Lemmas (Transkr/Boudams/Pie/Pie)")
# somCAH = somCluster(d)
# somCAHLemmas = somCAH
# somplotLemmas = cahPlotCol(somCAH, k = 5, main = "SOM BASED - Lemmas")
# Function lemmas
#rownames(data)[1:250]
# The list of function lemmas is the value of the last expression of functionLemmas.R
functionLemmas <- source("functionLemmas.R")$value
# data still holds the lemma matrix at this point; keep the function-lemma rows
d <- relativeFreqs(data)[functionLemmas, ]
# Copy kept for the combined analyses
FLSave <- d
d <- normalisations(d)
myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
CAHFL <- myCAH
# barplot(sort(myCAH$height))
plotFL <- cahPlotCol(
  myCAH,
  k = 5,
  main = "Function Lemmas with pronouns and auxiliaries\n(Transkr/Boudams/Pie)"
)
#plotCol(myCAH, main = "toto")
# somCAH = somCluster(d)
# somCAHFL = somCAH
# somplotFL = cahPlotCol(somCAH, k = 5, main = "SOM BASED - Function words (lemmas)")
# Combined analysis: affixes + POS 3-grams + function lemmas, stacked as rows
data <- rbind(AffixesSave, POS3grSave, FLSave)
d <- normalisations(data)
myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
CAHGlob <- myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotGlob <- cahPlotCol(
  myCAH,
  k = 5,
  main = "Affixes + POS 3- grams + Function words (lemmas)"
)
# somCAH = somCluster(d)
# somCAHGlob = somCAH
# somplotGlob = cahPlotCol(somCAH, k = 5, main = "SOM BASED - Affixes + POS 3- grams + Function words (lemmas)")
# Combined analysis: affixes + POS 3-grams + function words (word forms),
# stacked as rows
data <- rbind(AffixesSave, POS3grSave, FWSave)
d <- normalisations(data)
myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
CAHGlob2 <- myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotGlob2 <- cahPlotCol(
  myCAH,
  k = 5,
  main = "Affixes + POS 3- grams + Function words (unnorm.)"
)
# somCAH = somCluster(d)
# somCAHGlob2 = somCAH
# somplotGlob2 = cahPlotCol(somCAH, k = 5, main = "SOM BASED - Affixes + POS 3- grams + Function words (unnorm.)")
# Cluster membership of each text when the tree is cut into 5 groups
classes <- cutree(myCAH, k = 5)
classes
## 00_Ano_Leg-A_Ap_Ev_Dis_Pierre1 01_Ano_Leg-A_Ap_NA_Vie_Pierre2
## 1 1
## 02_Ano_Leg-A_Ap_NA_Pas_Paul 04_Ano_Leg-A_Ap_NA_Vie_Jean_Ev
## 1 1
## 05_Ano_Leg-A_Ap_NA_Vie_Jacques 06_Ano_Leg-A_Ap_NA_Vie_Matthieu
## 1 2
## 07_Ano_Leg-A_Ap_NA_Vie_SimonJude 08_Ano_Leg-A_Ap_NA_Vie_Philippe
## 2 2
## 09_Ano_Leg-A_Ap_NA_Vie_JacquesMineur 10_Ano_Leg-A_Ap_NA_Vie_Barthelemy
## 2 2
## 11_Ano_Leg-A_Ap_NA_Vie_Marc 12_Ano_Leg-A_Ma_Ho_Vie_Longin
## 2 2
## 13_Ano_Leg-B_Ma_Ho_Vie_Sebastien 14_Ano_Leg-B_Ma_Ho_Vie_Vincent
## 2 2
## 15_Ano_Leg-B_Ma_Ho_Vie_Georges 16_Ano_Leg-B_Ma_Ho_Vie_Christophe
## 2 2
## 17_Ano_Leg-B_Ma_Fe_Vie_Agathe 18_Ano_Leg-B_Ma_Fe_Vie_Luce
## 2 2
## 19_Ano_Leg-B_Ma_Fe_Vie_Agnes 20_Ano_Leg-B_Ma_Fe_Vie_Felicite
## 2 2
## 21_Ano_Leg-B_Ma_Fe_Vie_Christine 22_Ano_Leg-B_Ma_Fe_Vie_Cecile
## 2 2
## 23_Ano_Leg-B_Ma_Ho_Vie_Sixte 24_Ano_Leg-B_Ma_Ho_Vie_Laurent
## 2 2
## 25_Ano_Leg-B_Ma_Ho_Vie_Hippolyte 26_Ano_Leg-B_Ma_Ev_Vie_Lambert
## 2 3
## 27_Ano_Leg-B_Ma_Ho_Vie_Pantaleon 28_Ano_Leg-B_Ma_Ho_Vie_Clement
## 2 4
## 29_Wau_Leg-C_Co_Ev_Vie_Martin 31_Wau_Leg-C_Co_Ev_Dia_Martin3
## 3 3
## 32_Wau_Leg-C_Co_Ev_Vie_Brice 33_Wau_Leg-C_Co_Er_Vie_Gilles
## 3 3
## 34_Wau_Leg-C_Co_Ev_Vie_Martial 35_Wau_Leg-C_Co_Ev_Vie_Nicolas
## 3 3
## 36_Wau_Leg-C_Co_Ev_Mir_Nicolas2 37_Wau_Leg-C_Co_Ev_Tra_Nicolas3
## 3 3
## 38_Wau_Leg-C_Co_Ev_Vie_Jerome 39_Wau_Leg-C_Co_Ev_Vie_Benoit
## 3 3
## 40_Wau_Leg-C_Co_Er_Vie_Alexis 41_Ano_Leg-C_Vi_NA_Vie_Irene
## 3 5
## 42_Ano_Leg-B_Vi_NA_Ass_NotreDame 43_Ano_Leg-C_Vi_NA_Vie_Catherine
## 5 5
## 44_Ano_Leg-C_Ap_NA_Vie_Andre 45_Ano_Leg-C_Ap_NA_Pas_Andre2
## 5 5
## 46_Ano_Leg-B_Co_NA_Pur_Patrice 47_Ano_Leg-C_Co_er_Vie_PaulErmite
## 4 4
## 48_Ano_Leg-C_Co_ev_Tra_Benoit2 49_Ano_Leg-C_NA_NA_Vie_Maur
## 4 4
## 50_Ano_Leg-C_NA_NA_Vie_Placide 51_Ano_Leg-C_Ma_ho_Vie_Eustache
## 4 4
## 52_Ano_Leg-C_Co_NA_Vie_Fursi 53_Ano_Leg-C_Vi_NA_Vie_Marguerite
## 4 4
## 54_Ano_Leg-C_Vi_NA_Vie_Pelagie 55_Ano_Leg-C_Co_NA_Vie_Simeon
## 4 4
## 56_Ano_Leg-C_Co_NA_Vie_Mamertin 57_Ano_Leg-C_Vi_NA_Vie_Julien
## 4 4
## 58_Ano_Leg-C_Vi_NA_Vie_MarieEgyptienne 59_Ano_Leg-C_Vi_NA_Vie_Euphrasie
## 4 4
## 60_Ano_Leg-B_NA_NA_NA_Antechriste
## 5
# Bar plot of the nfeats largest and nfeats smallest values found in the first
# column of maDesc$quanti$`3` (cluster 3 holds the Wau_ texts in the listing
# above). NOTE(review): maDesc is not defined in the visible part of this
# script; presumably the first column holds the v-test -- confirm.
nfeats <- 10
values <- c(
  head(sort(maDesc$quanti$`3`[, 1], decreasing = TRUE), n = nfeats),
  head(sort(maDesc$quanti$`3`[, 1]), n = nfeats)
)
classBarplot(values, title = "V-test for Wauchier class", ylab = "v-test")
Example of the two main features of the Wauchier class
# Cluster ids as a labelled factor, with the Lambert text moved to a level of
# its own ("LAMB").
# NOTE(review): classlabels is not defined in the visible part of this script.
class <- as.factor(classes)
levels(class) <- classlabels
levels(class) <- c(levels(class), "LAMB")
class["26_Ano_Leg-B_Ma_Ev_Vie_Lambert"] <- "LAMB"
#NB:
# Relative frequencies with one row per text, plus the class of each text
rf <- cbind(
  as.data.frame(t(relativeFreqs(data))),
  class
)
# Scatter plot of two features, coloured by class
qplot(qil, om_, colour = class, data = rf)
# specifPlot is fed the raw count tables rather than the selected features
data <- rbind(dataAffs, dataPOS3gr, dataFW)
specifPlot(data, myCAH, k = 5)
# Combined analysis: affixes + POS 3-grams + function words + function lemmas,
# stacked as rows
data <- rbind(AffixesSave, POS3grSave, FWSave, FLSave)
d <- normalisations(data)
myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
CAHGlob3 <- myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotGlob3 <- cahPlotCol(
  myCAH,
  k = 5,
  main = "Affixes + POS 3- grams + Function words (both)"
)
# somCAH = somCluster(d)
# somCAHGlob3 = somCAH
# somplotGlob3 = cahPlotCol(somCAH, k = 5, main = "SOM BASED - Affixes + POS 3- grams + Function words (unnorm.)")
# Combined analysis: selected lemmas + selected word forms, stacked as rows
data <- rbind(LemmasSave, WordsSave)
d <- normalisations(data)
myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
CAHWordsLemmas <- myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotWordsLemmas <- cahPlotCol(
  myCAH,
  k = 5,
  main = "Word forms + lemmas"
)
# somCAH = somCluster(d)
# somCAHWordsLemmas = somCAH
# somplotWordsLemmas = cahPlotCol(somCAH, k = 5, main = "SOM BASED - Word forms + lemmas")
# Cluster membership of each text when the tree is cut into 5 groups
classes <- cutree(myCAH, k = 5)
classes
## 00_Ano_Leg-A_Ap_Ev_Dis_Pierre1 01_Ano_Leg-A_Ap_NA_Vie_Pierre2
## 1 1
## 02_Ano_Leg-A_Ap_NA_Pas_Paul 04_Ano_Leg-A_Ap_NA_Vie_Jean_Ev
## 1 1
## 05_Ano_Leg-A_Ap_NA_Vie_Jacques 06_Ano_Leg-A_Ap_NA_Vie_Matthieu
## 1 2
## 07_Ano_Leg-A_Ap_NA_Vie_SimonJude 08_Ano_Leg-A_Ap_NA_Vie_Philippe
## 2 2
## 09_Ano_Leg-A_Ap_NA_Vie_JacquesMineur 10_Ano_Leg-A_Ap_NA_Vie_Barthelemy
## 2 2
## 11_Ano_Leg-A_Ap_NA_Vie_Marc 12_Ano_Leg-A_Ma_Ho_Vie_Longin
## 3 3
## 13_Ano_Leg-B_Ma_Ho_Vie_Sebastien 14_Ano_Leg-B_Ma_Ho_Vie_Vincent
## 3 3
## 15_Ano_Leg-B_Ma_Ho_Vie_Georges 16_Ano_Leg-B_Ma_Ho_Vie_Christophe
## 3 3
## 17_Ano_Leg-B_Ma_Fe_Vie_Agathe 18_Ano_Leg-B_Ma_Fe_Vie_Luce
## 3 3
## 19_Ano_Leg-B_Ma_Fe_Vie_Agnes 20_Ano_Leg-B_Ma_Fe_Vie_Felicite
## 3 3
## 21_Ano_Leg-B_Ma_Fe_Vie_Christine 22_Ano_Leg-B_Ma_Fe_Vie_Cecile
## 3 3
## 23_Ano_Leg-B_Ma_Ho_Vie_Sixte 24_Ano_Leg-B_Ma_Ho_Vie_Laurent
## 3 3
## 25_Ano_Leg-B_Ma_Ho_Vie_Hippolyte 26_Ano_Leg-B_Ma_Ev_Vie_Lambert
## 3 4
## 27_Ano_Leg-B_Ma_Ho_Vie_Pantaleon 28_Ano_Leg-B_Ma_Ho_Vie_Clement
## 3 5
## 29_Wau_Leg-C_Co_Ev_Vie_Martin 31_Wau_Leg-C_Co_Ev_Dia_Martin3
## 4 4
## 32_Wau_Leg-C_Co_Ev_Vie_Brice 33_Wau_Leg-C_Co_Er_Vie_Gilles
## 4 4
## 34_Wau_Leg-C_Co_Ev_Vie_Martial 35_Wau_Leg-C_Co_Ev_Vie_Nicolas
## 4 4
## 36_Wau_Leg-C_Co_Ev_Mir_Nicolas2 37_Wau_Leg-C_Co_Ev_Tra_Nicolas3
## 4 4
## 38_Wau_Leg-C_Co_Ev_Vie_Jerome 39_Wau_Leg-C_Co_Ev_Vie_Benoit
## 4 4
## 40_Wau_Leg-C_Co_Er_Vie_Alexis 41_Ano_Leg-C_Vi_NA_Vie_Irene
## 4 5
## 42_Ano_Leg-B_Vi_NA_Ass_NotreDame 43_Ano_Leg-C_Vi_NA_Vie_Catherine
## 1 1
## 44_Ano_Leg-C_Ap_NA_Vie_Andre 45_Ano_Leg-C_Ap_NA_Pas_Andre2
## 1 1
## 46_Ano_Leg-B_Co_NA_Pur_Patrice 47_Ano_Leg-C_Co_er_Vie_PaulErmite
## 5 5
## 48_Ano_Leg-C_Co_ev_Tra_Benoit2 49_Ano_Leg-C_NA_NA_Vie_Maur
## 5 5
## 50_Ano_Leg-C_NA_NA_Vie_Placide 51_Ano_Leg-C_Ma_ho_Vie_Eustache
## 5 5
## 52_Ano_Leg-C_Co_NA_Vie_Fursi 53_Ano_Leg-C_Vi_NA_Vie_Marguerite
## 5 5
## 54_Ano_Leg-C_Vi_NA_Vie_Pelagie 55_Ano_Leg-C_Co_NA_Vie_Simeon
## 5 5
## 56_Ano_Leg-C_Co_NA_Vie_Mamertin 57_Ano_Leg-C_Vi_NA_Vie_Julien
## 5 5
## 58_Ano_Leg-C_Vi_NA_Vie_MarieEgyptienne 59_Ano_Leg-C_Vi_NA_Vie_Euphrasie
## 5 5
## 60_Ano_Leg-B_NA_NA_NA_Antechriste
## 5
# Bar plot of the nfeats largest and nfeats smallest values found in the first
# column of maDesc$quanti$`4` (cluster 4 holds the Wau_ texts in the listing
# above). NOTE(review): maDesc is not defined in the visible part of this
# script; presumably the first column holds the v-test -- confirm.
nfeats <- 10
values <- c(
  head(sort(maDesc$quanti$`4`[, 1], decreasing = TRUE), n = nfeats),
  head(sort(maDesc$quanti$`4`[, 1]), n = nfeats)
)
classBarplot(values, title = "V-test for Wauchier class", ylab = "v-test")
Example of the two main features of the Wauchier class
# Cluster ids as a labelled factor, with the Lambert text moved to a level of
# its own ("LAMB").
# NOTE(review): classlabels is not defined in the visible part of this script.
class <- as.factor(classes)
levels(class) <- classlabels
levels(class) <- c(levels(class), "LAMB")
class["26_Ano_Leg-B_Ma_Ev_Vie_Lambert"] <- "LAMB"
#NB:
# Relative frequencies with one row per text, plus the class of each text
rf <- cbind(as.data.frame(t(relativeFreqs(data))), class)
# Keep the class column together with the three plotted features. Without it,
# colour = class is no longer found in rf and is silently taken from the
# global variable of the same name, which only works while both happen to be
# in the same row order.
rf <- rf[, c("ensemble", "qil", "que", "class")]
# Two main of Wauchier class
qplot(qil, ensemble, colour = class, data = rf)
# TWO MOST CORRELATED TO CLUSTERS
qplot(qil, que, colour = class, data = rf)
#TODO: fix to take only the one that have been actually selected by the Moisl formula
data <- rbind(dataLemmas, dataWords)
specifPlot(data, myCAH, k = 5)
# Raw 3-grams, affixes + POS 3-grams + function words, and words + lemmas,
# stacked in a single column
gridExtra::grid.arrange(
  plotRaw3grams,
  plotGlob2,
  plotWordsLemmas,
  ncol = 1
)
#featlabel = "features of ME ±2σ with conf. > 90%"
#A = cahPlotCol(CAHLemma, main = "A", xlab = paste( ncol(CAHLemma$data), featlabel), k = 6, lrect = -12)
# B = cahPlotCol(CAHRhyme, main = "B", xlab = paste( ncol(CAHRhyme$data), featlabel), k = 6, lrect = -7, ylab = " ")
# C = cahPlotCol(CAHAllWords, main = "C", xlab = paste( ncol(CAHAllWords$data), featlabel), k = 6, ylab = " ")
# D = cahPlotCol(CAHAffs, main = "D", xlab = paste( ncol(CAHAffs$data), featlabel), k = 6, ylab = " ")
# E = cahPlotCol(CAHPOS3gr, main = "E", xlab = paste( ncol(CAHPOS3gr$data), featlabel), k = 6, lrect = -12 , ylab = " ")
# F = cahPlotCol(CAHmfw, main = "F", k = 6, lrect = -5, ylab = " ")
# gridExtra::grid.arrange(A, B, C, D, E, F, ncol = 2)
# The six single-feature-type dendrograms on a two-column grid
gridExtra::grid.arrange(
  plotAffixes, plotFW,
  plotFL, plotPOS3grams,
  plotForms, plotLemmas,
  ncol = 2
)
# The three combined analyses, one per row
gridExtra::grid.arrange(plotGlob, plotGlob2, plotGlob3, ncol = 1)
# All clusterings to benchmark, keyed by the row label used in the table below
cahList <- list(
  raw3grams = CAHRaw3gr,
  Affs = CAHAffs,
  FunctWords = CAHFW,
  FunctLemm = CAHFL,
  POS3gr = CAHPOS3gr,
  FWPOSandAffs = CAHGlob2,
  Forms = CAHForms,
  Lemmas = CAHLemmas,
  WordsLemmas = CAHWordsLemmas
)
#compareHC(cahList, k = 5)
# Benchmark every clustering, with the raw 3-gram clustering passed as first argument
benchmark <- benchmarkHC(CAHRaw3gr, cahList, k = 5)
round(benchmark, digits = 2)
## N AC CPMeyer CPREF
## raw3grams 1359 0.69 0.78 1.00
## Affs 766 0.65 0.83 0.90
## FunctWords 171 0.71 0.83 0.86
## FunctLemm 100 0.69 0.71 0.80
## POS3gr 314 0.67 0.71 0.75
## FWPOSandAffs 1251 0.64 0.80 0.97
## Forms 677 0.62 0.69 0.81
## Lemmas 525 0.59 0.68 0.73
## WordsLemmas 1202 0.62 0.83 0.90
# # Now with SOM
# cahSOMList = list(raw3grams = somCAHRaw3gr, Affs = somCAHAffs, FunctLemm = somCAHFL, POS3gr = somCAHPOS3gr, FLPOSandAffs = somCAHGlob, FWPOSandAffs = somCAHGlob2, FLFWPOSandAffs = somCAHGlob3, Forms = somCAHForms, Lemmas = somCAHLemmas, WordsLemmas = somCAHWordsLemmas, UnnormFW = somCAHFW)
#
# benchmark = benchmarkHC(CAHRaw3gr, cahSOMList, k = 5)
# round(benchmark, digits = 2)
# Volatility computed on the three reference analyses only
cahList <- list(
  raw3grams = CAHRaw3gr,
  FWPOSandAffs = CAHGlob2,
  WordsLemmas = CAHWordsLemmas
)
vol <- volatility(cahList, k = 5)
# Attach the text lengths (matched on row names); the length column is named
# "y" in the merged table
volRef <- merge(
  round(vol, digits = 2),
  nwords,
  by = "row.names",
  all.x = TRUE,
  all.y = FALSE
)
# Texts listed by increasing V_i
volRef[order(volRef[["V_i"]]), ]
## Row.names V_i y
## 59 60_Ano_Leg-B_NA_NA_NA_Antechriste -0.12 1501
## 27 27_Ano_Leg-B_Ma_Ho_Vie_Pantaleon 0.10 6563
## 41 42_Ano_Leg-B_Vi_NA_Ass_NotreDame 0.17 3119
## 42 43_Ano_Leg-C_Vi_NA_Vie_Catherine 0.17 8930
## 43 44_Ano_Leg-C_Ap_NA_Vie_Andre 0.17 3129
## 44 45_Ano_Leg-C_Ap_NA_Pas_Andre2 0.17 13349
## 40 41_Ano_Leg-C_Vi_NA_Vie_Irene 0.24 3176
## 1 00_Ano_Leg-A_Ap_Ev_Dis_Pierre1 0.41 6783
## 2 01_Ano_Leg-A_Ap_NA_Vie_Pierre2 0.41 5539
## 3 02_Ano_Leg-A_Ap_NA_Pas_Paul 0.41 4831
## 4 04_Ano_Leg-A_Ap_NA_Vie_Jean_Ev 0.41 4977
## 5 05_Ano_Leg-A_Ap_NA_Vie_Jacques 0.41 18068
## 6 06_Ano_Leg-A_Ap_NA_Vie_Matthieu 0.46 6490
## 7 07_Ano_Leg-A_Ap_NA_Vie_SimonJude 0.46 6835
## 8 08_Ano_Leg-A_Ap_NA_Vie_Philippe 0.46 1011
## 9 09_Ano_Leg-A_Ap_NA_Vie_JacquesMineur 0.46 1350
## 10 10_Ano_Leg-A_Ap_NA_Vie_Barthelemy 0.46 4349
## 11 11_Ano_Leg-A_Ap_NA_Vie_Marc 0.81 1822
## 12 12_Ano_Leg-A_Ma_Ho_Vie_Longin 0.81 2259
## 13 13_Ano_Leg-B_Ma_Ho_Vie_Sebastien 0.81 3543
## 14 14_Ano_Leg-B_Ma_Ho_Vie_Vincent 0.81 4836
## 15 15_Ano_Leg-B_Ma_Ho_Vie_Georges 0.81 4535
## 16 16_Ano_Leg-B_Ma_Ho_Vie_Christophe 0.81 9061
## 17 17_Ano_Leg-B_Ma_Fe_Vie_Agathe 0.81 3122
## 18 18_Ano_Leg-B_Ma_Fe_Vie_Luce 0.81 2395
## 19 19_Ano_Leg-B_Ma_Fe_Vie_Agnes 0.81 4212
## 20 20_Ano_Leg-B_Ma_Fe_Vie_Felicite 0.81 1675
## 21 21_Ano_Leg-B_Ma_Fe_Vie_Christine 0.81 7513
## 22 22_Ano_Leg-B_Ma_Fe_Vie_Cecile 0.81 6842
## 23 23_Ano_Leg-B_Ma_Ho_Vie_Sixte 0.81 1890
## 24 24_Ano_Leg-B_Ma_Ho_Vie_Laurent 0.81 3255
## 25 25_Ano_Leg-B_Ma_Ho_Vie_Hippolyte 0.81 2538
## 28 28_Ano_Leg-B_Ma_Ho_Vie_Clement 0.88 2567
## 45 46_Ano_Leg-B_Co_NA_Pur_Patrice 0.88 7885
## 46 47_Ano_Leg-C_Co_er_Vie_PaulErmite 0.88 3773
## 47 48_Ano_Leg-C_Co_ev_Tra_Benoit2 0.88 3276
## 48 49_Ano_Leg-C_NA_NA_Vie_Maur 0.88 6340
## 49 50_Ano_Leg-C_NA_NA_Vie_Placide 0.88 2798
## 50 51_Ano_Leg-C_Ma_ho_Vie_Eustache 0.88 3134
## 51 52_Ano_Leg-C_Co_NA_Vie_Fursi 0.88 2482
## 52 53_Ano_Leg-C_Vi_NA_Vie_Marguerite 0.88 1940
## 53 54_Ano_Leg-C_Vi_NA_Vie_Pelagie 0.88 1520
## 54 55_Ano_Leg-C_Co_NA_Vie_Simeon 0.88 2894
## 55 56_Ano_Leg-C_Co_NA_Vie_Mamertin 0.88 2225
## 56 57_Ano_Leg-C_Vi_NA_Vie_Julien 0.88 2797
## 57 58_Ano_Leg-C_Vi_NA_Vie_MarieEgyptienne 0.88 5588
## 58 59_Ano_Leg-C_Vi_NA_Vie_Euphrasie 0.88 1287
## 26 26_Ano_Leg-B_Ma_Ev_Vie_Lambert 1.00 5271
## 29 29_Wau_Leg-C_Co_Ev_Vie_Martin 1.00 14458
## 30 31_Wau_Leg-C_Co_Ev_Dia_Martin3 1.00 18981
## 31 32_Wau_Leg-C_Co_Ev_Vie_Brice 1.00 1395
## 32 33_Wau_Leg-C_Co_Er_Vie_Gilles 1.00 4433
## 33 34_Wau_Leg-C_Co_Ev_Vie_Martial 1.00 15299
## 34 35_Wau_Leg-C_Co_Ev_Vie_Nicolas 1.00 1977
## 35 36_Wau_Leg-C_Co_Ev_Mir_Nicolas2 1.00 10532
## 36 37_Wau_Leg-C_Co_Ev_Tra_Nicolas3 1.00 8403
## 37 38_Wau_Leg-C_Co_Ev_Vie_Jerome 1.00 2436
## 38 39_Wau_Leg-C_Co_Ev_Vie_Benoit 1.00 12827
## 39 40_Wau_Leg-C_Co_Er_Vie_Alexis 1.00 4120
# See if there is a correlation: linear regression of text length (column 3
# of volRef, printed as "y" above) on V_i (column 2). Columns are addressed by
# position, and the formula text is echoed verbatim in the printed Call.
reg = lm(volRef[, 3] ~ volRef[, 2])
summary(reg)
##
## Call:
## lm(formula = volRef[, 3] ~ volRef[, 2])
##
## Residuals:
## Min 1Q Median 3Q Max
## -4211 -2826 -1504 1478 13688
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5162.1 1495.8 3.451 0.00106 **
## volRef[, 2] 130.8 1926.9 0.068 0.94611
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4219 on 57 degrees of freedom
## Multiple R-squared: 8.085e-05, Adjusted R-squared: -0.01746
## F-statistic: 0.004609 on 1 and 57 DF, p-value: 0.9461
# Scatter plot of V_i (x, column 2) against text length (y, column 3), with
# the fitted regression line added on top
plot(volRef[, 2], volRef[, 3])
abline(reg)
# And the distribution of the V_i values
boxplot(volRef[, 2])
hist(volRef[, 2])
# Volatility computed on the six supplementary (single feature type) analyses
cahList <- list(
  Affs = CAHAffs,
  FunctWords = CAHFW,
  FunctLemm = CAHFL,
  POS3gr = CAHPOS3gr,
  Forms = CAHForms,
  Lemmas = CAHLemmas
)
vol <- volatility(cahList, k = 5)
# Attach the text lengths (matched on row names); the length column is named
# "y" in the merged table
volSuppl <- merge(
  round(vol, digits = 2),
  nwords,
  by = "row.names",
  all.x = TRUE,
  all.y = FALSE
)
# Texts listed by increasing V_i
volSuppl[order(volSuppl[["V_i"]]), ]
## Row.names V_i y
## 5 05_Ano_Leg-A_Ap_NA_Vie_Jacques -0.50 18068
## 28 28_Ano_Leg-B_Ma_Ho_Vie_Clement -0.40 2567
## 8 08_Ano_Leg-A_Ap_NA_Vie_Philippe -0.37 1011
## 9 09_Ano_Leg-A_Ap_NA_Vie_JacquesMineur -0.37 1350
## 44 45_Ano_Leg-C_Ap_NA_Pas_Andre2 -0.33 13349
## 39 40_Wau_Leg-C_Co_Er_Vie_Alexis -0.28 4120
## 59 60_Ano_Leg-B_NA_NA_NA_Antechriste -0.28 1501
## 27 27_Ano_Leg-B_Ma_Ho_Vie_Pantaleon -0.22 6563
## 11 11_Ano_Leg-A_Ap_NA_Vie_Marc -0.21 1822
## 14 14_Ano_Leg-B_Ma_Ho_Vie_Vincent -0.18 4836
## 41 42_Ano_Leg-B_Vi_NA_Ass_NotreDame -0.18 3119
## 6 06_Ano_Leg-A_Ap_NA_Vie_Matthieu -0.15 6490
## 7 07_Ano_Leg-A_Ap_NA_Vie_SimonJude -0.15 6835
## 10 10_Ano_Leg-A_Ap_NA_Vie_Barthelemy -0.15 4349
## 1 00_Ano_Leg-A_Ap_Ev_Dis_Pierre1 -0.13 6783
## 2 01_Ano_Leg-A_Ap_NA_Vie_Pierre2 -0.13 5539
## 3 02_Ano_Leg-A_Ap_NA_Pas_Paul -0.13 4831
## 4 04_Ano_Leg-A_Ap_NA_Vie_Jean_Ev -0.13 4977
## 42 43_Ano_Leg-C_Vi_NA_Vie_Catherine -0.13 8930
## 43 44_Ano_Leg-C_Ap_NA_Vie_Andre -0.13 3129
## 40 41_Ano_Leg-C_Vi_NA_Vie_Irene -0.11 3176
## 13 13_Ano_Leg-B_Ma_Ho_Vie_Sebastien -0.10 3543
## 16 16_Ano_Leg-B_Ma_Ho_Vie_Christophe -0.10 9061
## 22 22_Ano_Leg-B_Ma_Fe_Vie_Cecile -0.10 6842
## 26 26_Ano_Leg-B_Ma_Ev_Vie_Lambert -0.07 5271
## 31 32_Wau_Leg-C_Co_Ev_Vie_Brice -0.06 1395
## 33 34_Wau_Leg-C_Co_Ev_Vie_Martial -0.05 15299
## 34 35_Wau_Leg-C_Co_Ev_Vie_Nicolas -0.05 1977
## 37 38_Wau_Leg-C_Co_Ev_Vie_Jerome -0.03 2436
## 17 17_Ano_Leg-B_Ma_Fe_Vie_Agathe -0.02 3122
## 45 46_Ano_Leg-B_Co_NA_Pur_Patrice -0.02 7885
## 46 47_Ano_Leg-C_Co_er_Vie_PaulErmite -0.02 3773
## 58 59_Ano_Leg-C_Vi_NA_Vie_Euphrasie -0.02 1287
## 52 53_Ano_Leg-C_Vi_NA_Vie_Marguerite -0.01 1940
## 12 12_Ano_Leg-A_Ma_Ho_Vie_Longin 0.00 2259
## 20 20_Ano_Leg-B_Ma_Fe_Vie_Felicite 0.01 1675
## 23 23_Ano_Leg-B_Ma_Ho_Vie_Sixte 0.01 1890
## 24 24_Ano_Leg-B_Ma_Ho_Vie_Laurent 0.01 3255
## 25 25_Ano_Leg-B_Ma_Ho_Vie_Hippolyte 0.01 2538
## 50 51_Ano_Leg-C_Ma_ho_Vie_Eustache 0.02 3134
## 18 18_Ano_Leg-B_Ma_Fe_Vie_Luce 0.05 2395
## 19 19_Ano_Leg-B_Ma_Fe_Vie_Agnes 0.05 4212
## 15 15_Ano_Leg-B_Ma_Ho_Vie_Georges 0.12 4535
## 21 21_Ano_Leg-B_Ma_Fe_Vie_Christine 0.12 7513
## 53 54_Ano_Leg-C_Vi_NA_Vie_Pelagie 0.26 1520
## 55 56_Ano_Leg-C_Co_NA_Vie_Mamertin 0.26 2225
## 56 57_Ano_Leg-C_Vi_NA_Vie_Julien 0.26 2797
## 29 29_Wau_Leg-C_Co_Ev_Vie_Martin 0.57 14458
## 30 31_Wau_Leg-C_Co_Ev_Dia_Martin3 0.57 18981
## 32 33_Wau_Leg-C_Co_Er_Vie_Gilles 0.57 4433
## 35 36_Wau_Leg-C_Co_Ev_Mir_Nicolas2 0.57 10532
## 36 37_Wau_Leg-C_Co_Ev_Tra_Nicolas3 0.57 8403
## 38 39_Wau_Leg-C_Co_Ev_Vie_Benoit 0.57 12827
## 47 48_Ano_Leg-C_Co_ev_Tra_Benoit2 0.57 3276
## 48 49_Ano_Leg-C_NA_NA_Vie_Maur 0.57 6340
## 49 50_Ano_Leg-C_NA_NA_Vie_Placide 0.57 2798
## 51 52_Ano_Leg-C_Co_NA_Vie_Fursi 0.57 2482
## 54 55_Ano_Leg-C_Co_NA_Vie_Simeon 0.57 2894
## 57 58_Ano_Leg-C_Vi_NA_Vie_MarieEgyptienne 0.57 5588
# See if there is a correlation: linear regression of text length (column 3
# of volSuppl, printed as "y" above) on V_i (column 2). Columns are addressed
# by position, and the formula text is echoed verbatim in the printed Call.
reg = lm(volSuppl[, 3] ~ volSuppl[, 2])
summary(reg)
##
## Call:
## lm(formula = volSuppl[, 3] ~ volSuppl[, 2])
##
## Residuals:
## Min 1Q Median 3Q Max
## -4182 -2973 -1410 1874 13946
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5161.1 549.3 9.395 3.52e-13 ***
## volSuppl[, 2] 2078.9 1804.1 1.152 0.254
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4171 on 57 degrees of freedom
## Multiple R-squared: 0.02276, Adjusted R-squared: 0.00562
## F-statistic: 1.328 on 1 and 57 DF, p-value: 0.254
# Scatter plot of V_i (x, column 2) against text length (y, column 3), with
# the fitted regression line added on top
plot(volSuppl[, 2], volSuppl[, 3])
abline(reg)
# And the distribution of the V_i values
boxplot(volSuppl[, 2])
hist(volSuppl[, 2])
# Side-by-side table of text length and both volatility indices.
# The two tables are joined on the text name held in their "Row.names" column.
# Joining on the data frames' own row names (plain 1..n positions) is only
# correct while both tables happen to list the same texts in the same order.
# The suffixes turn the two V_i columns into V_iRef and V_iSuppl.
out <- merge(
  volRef[, c("Row.names", "y", "V_i")],
  volSuppl[, c("Row.names", "V_i")],
  by = "Row.names",
  all = TRUE,
  suffixes = c("Ref", "Suppl")
)
rownames(out) <- out[["Row.names"]]
out <- out[, c("y", "V_iRef", "V_iSuppl")]
colnames(out) <- c("NWords", "V_iRef", "V_iSuppl")
# Texts listed by increasing reference volatility (ties stay in text-name order)
out[order(out[, 2]), ]
## NWords V_iRef V_iSuppl
## 60_Ano_Leg-B_NA_NA_NA_Antechriste 1501 -0.12 -0.28
## 27_Ano_Leg-B_Ma_Ho_Vie_Pantaleon 6563 0.10 -0.22
## 42_Ano_Leg-B_Vi_NA_Ass_NotreDame 3119 0.17 -0.18
## 43_Ano_Leg-C_Vi_NA_Vie_Catherine 8930 0.17 -0.13
## 44_Ano_Leg-C_Ap_NA_Vie_Andre 3129 0.17 -0.13
## 45_Ano_Leg-C_Ap_NA_Pas_Andre2 13349 0.17 -0.33
## 41_Ano_Leg-C_Vi_NA_Vie_Irene 3176 0.24 -0.11
## 00_Ano_Leg-A_Ap_Ev_Dis_Pierre1 6783 0.41 -0.13
## 01_Ano_Leg-A_Ap_NA_Vie_Pierre2 5539 0.41 -0.13
## 02_Ano_Leg-A_Ap_NA_Pas_Paul 4831 0.41 -0.13
## 04_Ano_Leg-A_Ap_NA_Vie_Jean_Ev 4977 0.41 -0.13
## 05_Ano_Leg-A_Ap_NA_Vie_Jacques 18068 0.41 -0.50
## 10_Ano_Leg-A_Ap_NA_Vie_Barthelemy 4349 0.46 -0.15
## 06_Ano_Leg-A_Ap_NA_Vie_Matthieu 6490 0.46 -0.15
## 07_Ano_Leg-A_Ap_NA_Vie_SimonJude 6835 0.46 -0.15
## 08_Ano_Leg-A_Ap_NA_Vie_Philippe 1011 0.46 -0.37
## 09_Ano_Leg-A_Ap_NA_Vie_JacquesMineur 1350 0.46 -0.37
## 11_Ano_Leg-A_Ap_NA_Vie_Marc 1822 0.81 -0.21
## 12_Ano_Leg-A_Ma_Ho_Vie_Longin 2259 0.81 0.00
## 13_Ano_Leg-B_Ma_Ho_Vie_Sebastien 3543 0.81 -0.10
## 14_Ano_Leg-B_Ma_Ho_Vie_Vincent 4836 0.81 -0.18
## 15_Ano_Leg-B_Ma_Ho_Vie_Georges 4535 0.81 0.12
## 16_Ano_Leg-B_Ma_Ho_Vie_Christophe 9061 0.81 -0.10
## 17_Ano_Leg-B_Ma_Fe_Vie_Agathe 3122 0.81 -0.02
## 18_Ano_Leg-B_Ma_Fe_Vie_Luce 2395 0.81 0.05
## 19_Ano_Leg-B_Ma_Fe_Vie_Agnes 4212 0.81 0.05
## 20_Ano_Leg-B_Ma_Fe_Vie_Felicite 1675 0.81 0.01
## 21_Ano_Leg-B_Ma_Fe_Vie_Christine 7513 0.81 0.12
## 22_Ano_Leg-B_Ma_Fe_Vie_Cecile 6842 0.81 -0.10
## 23_Ano_Leg-B_Ma_Ho_Vie_Sixte 1890 0.81 0.01
## 24_Ano_Leg-B_Ma_Ho_Vie_Laurent 3255 0.81 0.01
## 25_Ano_Leg-B_Ma_Ho_Vie_Hippolyte 2538 0.81 0.01
## 28_Ano_Leg-B_Ma_Ho_Vie_Clement 2567 0.88 -0.40
## 46_Ano_Leg-B_Co_NA_Pur_Patrice 7885 0.88 -0.02
## 47_Ano_Leg-C_Co_er_Vie_PaulErmite 3773 0.88 -0.02
## 48_Ano_Leg-C_Co_ev_Tra_Benoit2 3276 0.88 0.57
## 49_Ano_Leg-C_NA_NA_Vie_Maur 6340 0.88 0.57
## 50_Ano_Leg-C_NA_NA_Vie_Placide 2798 0.88 0.57
## 51_Ano_Leg-C_Ma_ho_Vie_Eustache 3134 0.88 0.02
## 52_Ano_Leg-C_Co_NA_Vie_Fursi 2482 0.88 0.57
## 53_Ano_Leg-C_Vi_NA_Vie_Marguerite 1940 0.88 -0.01
## 54_Ano_Leg-C_Vi_NA_Vie_Pelagie 1520 0.88 0.26
## 55_Ano_Leg-C_Co_NA_Vie_Simeon 2894 0.88 0.57
## 56_Ano_Leg-C_Co_NA_Vie_Mamertin 2225 0.88 0.26
## 57_Ano_Leg-C_Vi_NA_Vie_Julien 2797 0.88 0.26
## 58_Ano_Leg-C_Vi_NA_Vie_MarieEgyptienne 5588 0.88 0.57
## 59_Ano_Leg-C_Vi_NA_Vie_Euphrasie 1287 0.88 -0.02
## 26_Ano_Leg-B_Ma_Ev_Vie_Lambert 5271 1.00 -0.07
## 29_Wau_Leg-C_Co_Ev_Vie_Martin 14458 1.00 0.57
## 31_Wau_Leg-C_Co_Ev_Dia_Martin3 18981 1.00 0.57
## 32_Wau_Leg-C_Co_Ev_Vie_Brice 1395 1.00 -0.06
## 33_Wau_Leg-C_Co_Er_Vie_Gilles 4433 1.00 0.57
## 34_Wau_Leg-C_Co_Ev_Vie_Martial 15299 1.00 -0.05
## 35_Wau_Leg-C_Co_Ev_Vie_Nicolas 1977 1.00 -0.05
## 36_Wau_Leg-C_Co_Ev_Mir_Nicolas2 10532 1.00 0.57
## 37_Wau_Leg-C_Co_Ev_Tra_Nicolas3 8403 1.00 0.57
## 38_Wau_Leg-C_Co_Ev_Vie_Jerome 2436 1.00 -0.03
## 39_Wau_Leg-C_Co_Ev_Vie_Benoit 12827 1.00 0.57
## 40_Wau_Leg-C_Co_Er_Vie_Alexis 4120 1.00 -0.28
# Long-format table for plotting: both volatility tables stacked on top of
# each other, with a type column telling them apart
volRegr <- rbind(
  data.frame(NWords = volRef[["y"]], V_i = volRef[["V_i"]], type = "Ref"),
  data.frame(NWords = volSuppl[["y"]], V_i = volSuppl[["V_i"]], type = "Suppl")
)
library(ggpmisc)
##
## Attaching package: 'ggpmisc'
## The following object is masked from 'package:ggplot2':
##
## annotate
# V_i against text length for both sets of analyses: points, one linear fit
# per type, and the p-value / adjusted R² of each fit printed on the plot.
# Disabled alternative annotation (equation + R²):
# ggpmisc::stat_poly_eq(formula = quote(V_i) ~ quote(NWords), aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")), parse = TRUE)
ggplot(volRegr, aes(NWords, V_i, shape = type, colour = type, fill = type)) +
  geom_smooth(method = "lm") +
  geom_point(size = 3) +
  theme_bw() +
  ggpmisc::stat_fit_glance(
    method = "lm",
    aes(label = paste0("p = ", round(..p.value.., 3), " Adj. R² = ", round(..adj.r.squared.., 3)))
  )
## `geom_smooth()` using formula 'y ~ x'
# Volatility index for every text across the clusterings in `cahList`, using
# k = 5 (presumably the number of clusters -- confirm in functions.R).
vol = volatility(cahList, k = 5)

# Attach the word counts to the (2-digit rounded) volatility values, matching
# on row names and keeping every row of the volatility table.
# NOTE(review): the rounding happens before the merge, so everything computed
# from `out` afterwards works on rounded V_i values.
out = merge(
  x = round(vol, digits = 2),
  y = nwords,
  by = "row.names",
  all.x = TRUE,
  all.y = FALSE
)

# Display the merged table sorted by increasing V_i.
out[order(out[["V_i"]]), ]
## Row.names V_i y
## 5 05_Ano_Leg-A_Ap_NA_Vie_Jacques -0.50 18068
## 28 28_Ano_Leg-B_Ma_Ho_Vie_Clement -0.40 2567
## 8 08_Ano_Leg-A_Ap_NA_Vie_Philippe -0.37 1011
## 9 09_Ano_Leg-A_Ap_NA_Vie_JacquesMineur -0.37 1350
## 44 45_Ano_Leg-C_Ap_NA_Pas_Andre2 -0.33 13349
## 39 40_Wau_Leg-C_Co_Er_Vie_Alexis -0.28 4120
## 59 60_Ano_Leg-B_NA_NA_NA_Antechriste -0.28 1501
## 27 27_Ano_Leg-B_Ma_Ho_Vie_Pantaleon -0.22 6563
## 11 11_Ano_Leg-A_Ap_NA_Vie_Marc -0.21 1822
## 14 14_Ano_Leg-B_Ma_Ho_Vie_Vincent -0.18 4836
## 41 42_Ano_Leg-B_Vi_NA_Ass_NotreDame -0.18 3119
## 6 06_Ano_Leg-A_Ap_NA_Vie_Matthieu -0.15 6490
## 7 07_Ano_Leg-A_Ap_NA_Vie_SimonJude -0.15 6835
## 10 10_Ano_Leg-A_Ap_NA_Vie_Barthelemy -0.15 4349
## 1 00_Ano_Leg-A_Ap_Ev_Dis_Pierre1 -0.13 6783
## 2 01_Ano_Leg-A_Ap_NA_Vie_Pierre2 -0.13 5539
## 3 02_Ano_Leg-A_Ap_NA_Pas_Paul -0.13 4831
## 4 04_Ano_Leg-A_Ap_NA_Vie_Jean_Ev -0.13 4977
## 42 43_Ano_Leg-C_Vi_NA_Vie_Catherine -0.13 8930
## 43 44_Ano_Leg-C_Ap_NA_Vie_Andre -0.13 3129
## 40 41_Ano_Leg-C_Vi_NA_Vie_Irene -0.11 3176
## 13 13_Ano_Leg-B_Ma_Ho_Vie_Sebastien -0.10 3543
## 16 16_Ano_Leg-B_Ma_Ho_Vie_Christophe -0.10 9061
## 22 22_Ano_Leg-B_Ma_Fe_Vie_Cecile -0.10 6842
## 26 26_Ano_Leg-B_Ma_Ev_Vie_Lambert -0.07 5271
## 31 32_Wau_Leg-C_Co_Ev_Vie_Brice -0.06 1395
## 33 34_Wau_Leg-C_Co_Ev_Vie_Martial -0.05 15299
## 34 35_Wau_Leg-C_Co_Ev_Vie_Nicolas -0.05 1977
## 37 38_Wau_Leg-C_Co_Ev_Vie_Jerome -0.03 2436
## 17 17_Ano_Leg-B_Ma_Fe_Vie_Agathe -0.02 3122
## 45 46_Ano_Leg-B_Co_NA_Pur_Patrice -0.02 7885
## 46 47_Ano_Leg-C_Co_er_Vie_PaulErmite -0.02 3773
## 58 59_Ano_Leg-C_Vi_NA_Vie_Euphrasie -0.02 1287
## 52 53_Ano_Leg-C_Vi_NA_Vie_Marguerite -0.01 1940
## 12 12_Ano_Leg-A_Ma_Ho_Vie_Longin 0.00 2259
## 20 20_Ano_Leg-B_Ma_Fe_Vie_Felicite 0.01 1675
## 23 23_Ano_Leg-B_Ma_Ho_Vie_Sixte 0.01 1890
## 24 24_Ano_Leg-B_Ma_Ho_Vie_Laurent 0.01 3255
## 25 25_Ano_Leg-B_Ma_Ho_Vie_Hippolyte 0.01 2538
## 50 51_Ano_Leg-C_Ma_ho_Vie_Eustache 0.02 3134
## 18 18_Ano_Leg-B_Ma_Fe_Vie_Luce 0.05 2395
## 19 19_Ano_Leg-B_Ma_Fe_Vie_Agnes 0.05 4212
## 15 15_Ano_Leg-B_Ma_Ho_Vie_Georges 0.12 4535
## 21 21_Ano_Leg-B_Ma_Fe_Vie_Christine 0.12 7513
## 53 54_Ano_Leg-C_Vi_NA_Vie_Pelagie 0.26 1520
## 55 56_Ano_Leg-C_Co_NA_Vie_Mamertin 0.26 2225
## 56 57_Ano_Leg-C_Vi_NA_Vie_Julien 0.26 2797
## 29 29_Wau_Leg-C_Co_Ev_Vie_Martin 0.57 14458
## 30 31_Wau_Leg-C_Co_Ev_Dia_Martin3 0.57 18981
## 32 33_Wau_Leg-C_Co_Er_Vie_Gilles 0.57 4433
## 35 36_Wau_Leg-C_Co_Ev_Mir_Nicolas2 0.57 10532
## 36 37_Wau_Leg-C_Co_Ev_Tra_Nicolas3 0.57 8403
## 38 39_Wau_Leg-C_Co_Ev_Vie_Benoit 0.57 12827
## 47 48_Ano_Leg-C_Co_ev_Tra_Benoit2 0.57 3276
## 48 49_Ano_Leg-C_NA_NA_Vie_Maur 0.57 6340
## 49 50_Ano_Leg-C_NA_NA_Vie_Placide 0.57 2798
## 51 52_Ano_Leg-C_Co_NA_Vie_Fursi 0.57 2482
## 54 55_Ano_Leg-C_Co_NA_Vie_Simeon 0.57 2894
## 57 58_Ano_Leg-C_Vi_NA_Vie_MarieEgyptienne 0.57 5588
# See whether the two merged quantities are linearly related: regress column 3
# of `out` (the merged `nwords` values, printed as `y`) on column 2 (`V_i`).
# The formula is deliberately left on indexed columns: its text is echoed in
# the printed Call and in the coefficient labels of the summary.
# NOTE(review): `out` holds V_i rounded to 2 digits, so the fit uses the
# rounded values.
reg = lm(out[, 3] ~ out[, 2])
summary(reg)
##
## Call:
## lm(formula = out[, 3] ~ out[, 2])
##
## Residuals:
## Min 1Q Median 3Q Max
## -4182 -2973 -1410 1874 13946
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5161.1 549.3 9.395 3.52e-13 ***
## out[, 2] 2078.9 1804.1 1.152 0.254
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4171 on 57 degrees of freedom
## Multiple R-squared: 0.02276, Adjusted R-squared: 0.00562
## F-statistic: 1.328 on 1 and 57 DF, p-value: 0.254
# Scatter plot of column 3 of `out` against column 2, with the fitted
# regression line from `reg` drawn on top.
plot(x = out[, 2], y = out[, 3])
abline(reg = reg)

# And the distribution of the V_i values (column 2 of `out`).
boxplot(x = out[, 2])
hist(x = out[, 2])
# Reference clusterings, one named entry per feature set.
RefcahList = list(
  raw3grams = CAHRaw3gr,
  Affs = CAHAffs,
  FunctWords = CAHFW,
  FunctLemm = CAHFL,
  POS3gr = CAHPOS3gr,
  FWPOSandAffs = CAHGlob2,
  Forms = CAHForms,
  Lemmas = CAHLemmas,
  WordsLemmas = CAHWordsLemmas
)

## CAREFUL -- temporary fix, currently disabled; remove it later.
#toKeepBis = toKeep[!toKeep == "60_Ano_Leg-B_NA_NA_NA_Antechriste"]
# Redo base results without Antechrist
#RefcahListBis = replicateAnalysis(toKeepBis, "data/transkr_raw_char3grams.csv", "data/transkr_expanded_words.csv", "data/transkr_pos3-gr.csv", "data/transkr_lemmas.csv", functionWords, functionLemmas)
#StudentsResults = replicateAnalysis(toKeepBis, "data/transkr_student_raw_char3grams.csv", "data/transkr_student_expanded_words.csv", "data/transkr_student_pos3-gr.csv", "data/transkr_student_lemmas.csv", functionWords, functionLemmas)

# 1. Rerun the analysis on the "student" data files.
StudentsResults = replicateAnalysis(
  toKeep,
  "data/transkr_student_raw_char3grams.csv",
  "data/transkr_student_expanded_words.csv",
  "data/transkr_student_pos3-gr.csv",
  "data/transkr_student_lemmas.csv",
  functionWords,
  functionLemmas
)

# 2. Compare it with the reference clusterings (k = 5).
comp1 = compareReplications(RefcahList, StudentsResults, k = 5)

# 3. Rerun the analysis on the "kraken_nospace" data files.
KrakenResults = replicateAnalysis(
  toKeep,
  "data/kraken_nospace_raw_char3grams.csv",
  "data/kraken_nospace_expanded_words.csv",
  "data/kraken_nospace_pos3-gr.csv",
  "data/kraken_nospace_lemmas.csv",
  functionWords,
  functionLemmas
)

# 4. Compare it with the reference clusterings (k = 5).
comp2 = compareReplications(RefcahList, KrakenResults, k = 5)

# 5. Put both comparisons side by side, one column per replication.
out = cbind(comp1, comp2)
colnames(out) = c("Students", "Kraken")

# Rows counted as reference configurations; every other row is supplementary.
refs = c("raw3grams", "FWPOSandAffs", "WordsLemmas")
suppl = rownames(out)[!(rownames(out) %in% refs)]

# Append three summary rows: the column-wise geometric mean over all rows,
# over the reference rows and over the supplementary rows. All three are
# computed from the table as it stands before any summary row is added.
out = local({
  geomMean = function(m) exp(colMeans(log(m)))
  summaryRows = rbind(
    geomMean(out),
    geomMean(out[refs, ]),
    geomMean(out[suppl, ])
  )
  rownames(summaryRows) = c("geom mean all", "geom mean refs", "geom mean suppl")
  rbind(out, summaryRows)
})

round(out, 2)
## Students Kraken
## raw3grams 1.00 0.86
## Affs 0.92 0.92
## FunctWords 0.90 0.75
## FunctLemm 0.83 0.78
## POS3gr 0.85 0.86
## FWPOSandAffs 0.90 0.98
## Forms 0.93 0.78
## Lemmas 0.80 0.66
## WordsLemmas 0.76 0.97
## geom mean all 0.87 0.83
## geom mean refs 0.88 0.94
## geom mean suppl 0.87 0.79